import pandas as pd
EDA: 2015 Street Tree Census - Tree Data
Download the data:
import zipfile

# Name of the CSV member to read from the archive; replace with your file name.
file_to_extract = "2015-street-tree-census-tree-data.csv"

with zipfile.ZipFile("ny-2015-street-tree-census-tree-data.zip", "r") as z:
    # List every member stored in the archive.
    all_files = z.namelist()
    # Fail loudly if the member is absent: otherwise `data` would never be
    # assigned and the first later use would raise a confusing NameError.
    if file_to_extract not in all_files:
        raise FileNotFoundError(
            f"{file_to_extract!r} not found in archive; "
            f"available members: {all_files}"
        )
    # Parse the CSV straight from the archive without extracting it to disk.
    with z.open(file_to_extract) as f:
        data = pd.read_csv(f)
# Data preview:
data.head()
| | tree_id | block_id | created_at | tree_dbh | stump_diam | curb_loc | status | health | spc_latin | spc_common | ... | boro_ct | state | latitude | longitude | x_sp | y_sp | council district | census tract | bin | bbl |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 180683 | 348711 | 2015-08-27T00:00:00.000 | 3 | 0 | OnCurb | Alive | Fair | Acer rubrum | red maple | ... | 4073900 | New York | 40.723092 | -73.844215 | 1027431.148 | 202756.7687 | 29.0 | 739.0 | 4052307.0 | 4.022210e+09 |
| 1 | 200540 | 315986 | 2015-09-03T00:00:00.000 | 21 | 0 | OnCurb | Alive | Fair | Quercus palustris | pin oak | ... | 4097300 | New York | 40.794111 | -73.818679 | 1034455.701 | 228644.8374 | 19.0 | 973.0 | 4101931.0 | 4.044750e+09 |
| 2 | 204026 | 218365 | 2015-09-05T00:00:00.000 | 3 | 0 | OnCurb | Alive | Good | Gleditsia triacanthos var. inermis | honeylocust | ... | 3044900 | New York | 40.717581 | -73.936608 | 1001822.831 | 200716.8913 | 34.0 | 449.0 | 3338310.0 | 3.028870e+09 |
| 3 | 204337 | 217969 | 2015-09-05T00:00:00.000 | 10 | 0 | OnCurb | Alive | Good | Gleditsia triacanthos var. inermis | honeylocust | ... | 3044900 | New York | 40.713537 | -73.934456 | 1002420.358 | 199244.2531 | 34.0 | 449.0 | 3338342.0 | 3.029250e+09 |
| 4 | 189565 | 223043 | 2015-08-30T00:00:00.000 | 21 | 0 | OnCurb | Alive | Good | Tilia americana | American linden | ... | 3016500 | New York | 40.666778 | -73.975979 | 990913.775 | 182202.4260 | 39.0 | 165.0 | 3025654.0 | 3.010850e+09 |
5 rows × 45 columns
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 683788 entries, 0 to 683787
Data columns (total 45 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 tree_id 683788 non-null int64
1 block_id 683788 non-null int64
2 created_at 683788 non-null object
3 tree_dbh 683788 non-null int64
4 stump_diam 683788 non-null int64
5 curb_loc 683788 non-null object
6 status 683788 non-null object
7 health 652172 non-null object
8 spc_latin 652169 non-null object
9 spc_common 652169 non-null object
10 steward 164350 non-null object
11 guards 79866 non-null object
12 sidewalk 652172 non-null object
13 user_type 683788 non-null object
14 problems 225844 non-null object
15 root_stone 683788 non-null object
16 root_grate 683788 non-null object
17 root_other 683788 non-null object
18 trunk_wire 683788 non-null object
19 trnk_light 683788 non-null object
20 trnk_other 683788 non-null object
21 brch_light 683788 non-null object
22 brch_shoe 683788 non-null object
23 brch_other 683788 non-null object
24 address 683788 non-null object
25 postcode 683788 non-null int64
26 zip_city 683788 non-null object
27 community board 683788 non-null int64
28 borocode 683788 non-null int64
29 borough 683788 non-null object
30 cncldist 683788 non-null int64
31 st_assem 683788 non-null int64
32 st_senate 683788 non-null int64
33 nta 683788 non-null object
34 nta_name 683788 non-null object
35 boro_ct 683788 non-null int64
36 state 683788 non-null object
37 latitude 683788 non-null float64
38 longitude 683788 non-null float64
39 x_sp 683788 non-null float64
40 y_sp 683788 non-null float64
41 council district 677269 non-null float64
42 census tract 677269 non-null float64
43 bin 674229 non-null float64
44 bbl 674229 non-null float64
dtypes: float64(8), int64(11), object(26)
memory usage: 234.8+ MB
data.describe()
| | tree_id | block_id | tree_dbh | stump_diam | postcode | community board | borocode | cncldist | st_assem | st_senate | boro_ct | latitude | longitude | x_sp | y_sp | council district | census tract | bin | bbl |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 683788.000000 | 683788.000000 | 683788.000000 | 683788.000000 | 683788.000000 | 683788.000000 | 683788.000000 | 683788.000000 | 683788.000000 | 683788.000000 | 6.837880e+05 | 683788.000000 | 683788.000000 | 6.837880e+05 | 683788.000000 | 677269.000000 | 677269.000000 | 6.742290e+05 | 6.742290e+05 |
| mean | 365205.011085 | 313793.096236 | 11.279787 | 0.432463 | 10916.246044 | 343.505404 | 3.358500 | 29.943181 | 50.791583 | 20.615781 | 3.404914e+06 | 40.701261 | -73.924060 | 1.005280e+06 | 194798.424625 | 30.027330 | 11957.368422 | 3.495439e+06 | 3.413414e+09 |
| std | 208122.092902 | 114839.024312 | 8.723042 | 3.290241 | 651.553364 | 115.740601 | 1.166746 | 14.328531 | 18.966520 | 7.390844 | 1.175863e+06 | 0.090311 | 0.123583 | 3.428505e+04 | 32902.061114 | 14.301717 | 30745.739811 | 1.193275e+06 | 1.174892e+09 |
| min | 3.000000 | 100002.000000 | 0.000000 | 0.000000 | 83.000000 | 101.000000 | 1.000000 | 1.000000 | 23.000000 | 10.000000 | 1.000201e+06 | 40.498466 | -74.254965 | 9.133493e+05 | 120973.792200 | 1.000000 | 1.000000 | 1.000000e+06 | 0.000000e+00 |
| 25% | 186582.750000 | 221556.000000 | 4.000000 | 0.000000 | 10451.000000 | 302.000000 | 3.000000 | 19.000000 | 33.000000 | 14.000000 | 3.011700e+06 | 40.631928 | -73.980500 | 9.896578e+05 | 169515.153700 | 19.000000 | 202.000000 | 3.031991e+06 | 3.011240e+09 |
| 50% | 366214.500000 | 319967.000000 | 9.000000 | 0.000000 | 11214.000000 | 402.000000 | 4.000000 | 30.000000 | 52.000000 | 21.000000 | 4.008100e+06 | 40.700612 | -73.912911 | 1.008386e+06 | 194560.252500 | 30.000000 | 516.000000 | 4.020352e+06 | 4.008560e+09 |
| 75% | 546170.250000 | 404624.000000 | 16.000000 | 0.000000 | 11365.000000 | 412.000000 | 4.000000 | 43.000000 | 64.000000 | 25.000000 | 4.103202e+06 | 40.762228 | -73.834910 | 1.029991e+06 | 217019.571950 | 43.000000 | 1417.000000 | 4.263123e+06 | 4.105700e+09 |
| max | 722694.000000 | 999999.000000 | 450.000000 | 140.000000 | 11697.000000 | 503.000000 | 5.000000 | 51.000000 | 87.000000 | 36.000000 | 5.032300e+06 | 40.912918 | -73.700488 | 1.067248e+06 | 271894.092100 | 51.000000 | 157903.000000 | 5.515124e+06 | 5.080500e+09 |
Count missing values:
Missing values. Table:
data_na = data.isna().sum()
data_na = data_na.loc[data_na != 0]
data_na
health 31616
spc_latin 31619
spc_common 31619
steward 519438
guards 603922
sidewalk 31616
problems 457944
council district 6519
census tract 6519
bin 9559
bbl 9559
dtype: int64
print("Missing values, %")
print(round(data_na / 683788 * 100, 4))
Missing values, %
health 4.6237
spc_latin 4.6241
spc_common 4.6241
steward 75.9648
guards 88.3201
sidewalk 4.6237
problems 66.9716
council district 0.9534
census tract 0.9534
bin 1.3979
bbl 1.3979
dtype: float64
Missing values. Plot:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="darkgrid")
data_na = round(data_na / 683788 * 100, 4)
data_na = data_na.sort_values(ascending=False)
plt.figure(figsize=(12, 6))
plt.xticks(rotation=45)
plt.xlabel("Columns")
plt.ylabel("Missing values, %")
sns.barplot(data_na).set_title("Columns with missing values")
Text(0.5, 1.0, 'Columns with missing values')

Data correlation analysis:
Impute missing values:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer()
data.select_dtypes(include=("int", "float"))
| | tree_id | block_id | tree_dbh | stump_diam | postcode | community board | borocode | cncldist | st_assem | st_senate | boro_ct | latitude | longitude | x_sp | y_sp | council district | census tract | bin | bbl |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 180683 | 348711 | 3 | 0 | 11375 | 406 | 4 | 29 | 28 | 16 | 4073900 | 40.723092 | -73.844215 | 1.027431e+06 | 202756.7687 | 29.0 | 739.0 | 4052307.0 | 4.022210e+09 |
| 1 | 200540 | 315986 | 21 | 0 | 11357 | 407 | 4 | 19 | 27 | 11 | 4097300 | 40.794111 | -73.818679 | 1.034456e+06 | 228644.8374 | 19.0 | 973.0 | 4101931.0 | 4.044750e+09 |
| 2 | 204026 | 218365 | 3 | 0 | 11211 | 301 | 3 | 34 | 50 | 18 | 3044900 | 40.717581 | -73.936608 | 1.001823e+06 | 200716.8913 | 34.0 | 449.0 | 3338310.0 | 3.028870e+09 |
| 3 | 204337 | 217969 | 10 | 0 | 11211 | 301 | 3 | 34 | 53 | 18 | 3044900 | 40.713537 | -73.934456 | 1.002420e+06 | 199244.2531 | 34.0 | 449.0 | 3338342.0 | 3.029250e+09 |
| 4 | 189565 | 223043 | 21 | 0 | 11215 | 306 | 3 | 39 | 44 | 21 | 3016500 | 40.666778 | -73.975979 | 9.909138e+05 | 182202.4260 | 39.0 | 165.0 | 3025654.0 | 3.010850e+09 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 683783 | 155433 | 217978 | 25 | 0 | 11211 | 301 | 3 | 34 | 53 | 18 | 3051900 | 40.713211 | -73.954944 | 9.967407e+05 | 199121.6363 | 34.0 | 519.0 | 3062513.0 | 3.023690e+09 |
| 683784 | 183795 | 348185 | 7 | 0 | 11375 | 406 | 4 | 29 | 28 | 15 | 4070700 | 40.715194 | -73.856650 | 1.023989e+06 | 199873.6475 | 29.0 | 707.0 | 4075448.0 | 4.031810e+09 |
| 683785 | 166161 | 401670 | 12 | 0 | 10314 | 501 | 5 | 50 | 63 | 24 | 5020100 | 40.620762 | -74.136517 | 9.463514e+05 | 165466.0763 | 50.0 | 201.0 | 5011657.0 | 5.004080e+09 |
| 683786 | 184028 | 504204 | 9 | 0 | 10457 | 205 | 2 | 15 | 86 | 33 | 2023502 | 40.850828 | -73.903115 | 1.011054e+06 | 249271.9507 | 15.0 | 23502.0 | 2007757.0 | 2.028120e+09 |
| 683787 | 200607 | 306527 | 23 | 0 | 11365 | 408 | 4 | 24 | 25 | 11 | 4134100 | 40.732165 | -73.787526 | 1.043136e+06 | 206095.5383 | 24.0 | 1341.0 | 4153657.0 | 4.071360e+09 |
683788 rows × 19 columns
data_numerical = data.select_dtypes(include=("int", "float")).isnull().sum()
data_numerical
tree_id 0
block_id 0
tree_dbh 0
stump_diam 0
postcode 0
community board 0
borocode 0
cncldist 0
st_assem 0
st_senate 0
boro_ct 0
latitude 0
longitude 0
x_sp 0
y_sp 0
council district 6519
census tract 6519
bin 9559
bbl 9559
dtype: int64
imputed_df = pd.DataFrame(
data=imputer.fit_transform(data.select_dtypes(include=("int", "float"))),
columns=data.select_dtypes(include=("int", "float")).columns,
)
imputed_df.isnull().sum()
tree_id 0
block_id 0
tree_dbh 0
stump_diam 0
postcode 0
community board 0
borocode 0
cncldist 0
st_assem 0
st_senate 0
boro_ct 0
latitude 0
longitude 0
x_sp 0
y_sp 0
council district 0
census tract 0
bin 0
bbl 0
dtype: int64
Data correlation table:
data.select_dtypes(include=("int", "float")).corr()
| | tree_id | block_id | tree_dbh | stump_diam | postcode | community board | borocode | cncldist | st_assem | st_senate | boro_ct | latitude | longitude | x_sp | y_sp | council district | census tract | bin | bbl |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| tree_id | 1.000000 | 0.114649 | 0.098413 | 0.012563 | 0.199660 | 0.235266 | 0.226142 | 0.149115 | -0.209727 | -0.231528 | 0.229696 | -0.131784 | 0.137696 | 0.137675 | -0.131558 | 0.146893 | 0.046812 | 0.237478 | 0.231893 |
| block_id | 0.114649 | 1.000000 | 0.002445 | 0.003633 | -0.074176 | 0.359408 | 0.364807 | 0.063857 | 0.206664 | 0.193345 | 0.358776 | 0.142741 | -0.009611 | -0.010409 | 0.143108 | 0.059914 | 0.078268 | 0.342150 | 0.363312 |
| tree_dbh | 0.098413 | 0.002445 | 1.000000 | -0.169963 | 0.099781 | 0.100037 | 0.094372 | 0.057056 | -0.144429 | -0.133569 | 0.097330 | -0.034252 | 0.093162 | 0.093166 | -0.034172 | 0.057132 | 0.036964 | 0.097007 | 0.095945 |
| stump_diam | 0.012563 | 0.003633 | -0.169963 | 1.000000 | 0.040682 | 0.022461 | 0.020702 | 0.004629 | -0.046938 | -0.040944 | 0.021716 | -0.004690 | 0.036701 | 0.036707 | -0.004663 | 0.004276 | 0.007326 | 0.022184 | 0.021428 |
| postcode | 0.199660 | -0.074176 | 0.099781 | 0.040682 | 1.000000 | 0.324896 | 0.309710 | 0.169539 | -0.674876 | -0.648751 | 0.323152 | -0.106493 | 0.492576 | 0.492905 | -0.106246 | 0.166506 | 0.114339 | 0.340281 | 0.315693 |
| community board | 0.235266 | 0.359408 | 0.100037 | 0.022461 | 0.324896 | 1.000000 | 0.999230 | 0.679708 | -0.543515 | -0.593410 | 0.999150 | -0.627760 | -0.171519 | -0.171635 | -0.627284 | 0.677678 | 0.176613 | 0.996748 | 0.999631 |
| borocode | 0.226142 | 0.364807 | 0.094372 | 0.020702 | 0.309710 | 0.999230 | 1.000000 | 0.676715 | -0.529001 | -0.580983 | 0.999268 | -0.624571 | -0.191089 | -0.191214 | -0.624108 | 0.674621 | 0.169195 | 0.995559 | 0.999541 |
| cncldist | 0.149115 | 0.063857 | 0.057056 | 0.004629 | 0.169539 | 0.679708 | 0.676715 | 1.000000 | -0.139636 | -0.194772 | 0.670272 | -0.885264 | -0.554068 | -0.553739 | -0.885337 | 0.999771 | -0.018487 | 0.670717 | 0.672331 |
| st_assem | -0.209727 | 0.206664 | -0.144429 | -0.046938 | -0.674876 | -0.543515 | -0.529001 | -0.139636 | 1.000000 | 0.932196 | -0.544253 | 0.223407 | -0.489973 | -0.490305 | 0.222944 | -0.137477 | -0.239845 | -0.562343 | -0.537014 |
| st_senate | -0.231528 | 0.193345 | -0.133569 | -0.040944 | -0.648751 | -0.593410 | -0.580983 | -0.194772 | 0.932196 | 1.000000 | -0.595534 | 0.278002 | -0.439158 | -0.439504 | 0.277535 | -0.192079 | -0.224303 | -0.611019 | -0.587548 |
| boro_ct | 0.229696 | 0.358776 | 0.097330 | 0.021716 | 0.323152 | 0.999150 | 0.999268 | 0.670272 | -0.544253 | -0.595534 | 1.000000 | -0.617911 | -0.169691 | -0.169818 | -0.617434 | 0.668110 | 0.183375 | 0.996119 | 0.999326 |
| latitude | -0.131784 | 0.142741 | -0.034252 | -0.004690 | -0.106493 | -0.627760 | -0.624571 | -0.885264 | 0.223407 | 0.278002 | -0.617911 | 1.000000 | 0.572289 | 0.571812 | 0.999999 | -0.886017 | -0.014834 | -0.628894 | -0.626599 |
| longitude | 0.137696 | -0.009611 | 0.093162 | 0.036701 | 0.492576 | -0.171519 | -0.191089 | -0.554068 | -0.489973 | -0.439158 | -0.169691 | 0.572289 | 1.000000 | 0.999999 | 0.572757 | -0.557000 | 0.220531 | -0.152223 | -0.178745 |
| x_sp | 0.137675 | -0.010409 | 0.093166 | 0.036707 | 0.492905 | -0.171635 | -0.191214 | -0.553739 | -0.490305 | -0.439504 | -0.169818 | 0.571812 | 0.999999 | 1.000000 | 0.572280 | -0.556669 | 0.220356 | -0.152314 | -0.178864 |
| y_sp | -0.131558 | 0.143108 | -0.034172 | -0.004663 | -0.106246 | -0.627284 | -0.624108 | -0.885337 | 0.222944 | 0.277535 | -0.617434 | 0.999999 | 0.572757 | 0.572280 | 1.000000 | -0.886092 | -0.014404 | -0.628408 | -0.626121 |
| council district | 0.146893 | 0.059914 | 0.057132 | 0.004276 | 0.166506 | 0.677678 | 0.674621 | 0.999771 | -0.137477 | -0.192079 | 0.668110 | -0.886017 | -0.557000 | -0.556669 | -0.886092 | 1.000000 | -0.018348 | 0.670766 | 0.672369 |
| census tract | 0.046812 | 0.078268 | 0.036964 | 0.007326 | 0.114339 | 0.176613 | 0.169195 | -0.018487 | -0.239845 | -0.224303 | 0.183375 | -0.014834 | 0.220531 | 0.220356 | -0.014404 | -0.018348 | 1.000000 | 0.183009 | 0.174843 |
| bin | 0.237478 | 0.342150 | 0.097007 | 0.022184 | 0.340281 | 0.996748 | 0.995559 | 0.670717 | -0.562343 | -0.611019 | 0.996119 | -0.628894 | -0.152223 | -0.152314 | -0.628408 | 0.670766 | 0.183009 | 1.000000 | 0.996809 |
| bbl | 0.231893 | 0.363312 | 0.095945 | 0.021428 | 0.315693 | 0.999631 | 0.999541 | 0.672331 | -0.537014 | -0.587548 | 0.999326 | -0.626599 | -0.178745 | -0.178864 | -0.626121 | 0.672369 | 0.174843 | 0.996809 | 1.000000 |
Data correlation heatmap:
# Draw the pairwise correlation matrix of the imputed numeric columns as a heatmap.
correlation_matrix = imputed_df.corr()
sns.heatmap(correlation_matrix)
Plot tree locations:
imputed_df
| | tree_id | block_id | tree_dbh | stump_diam | postcode | community board | borocode | cncldist | st_assem | st_senate | boro_ct | latitude | longitude | x_sp | y_sp | council district | census tract | bin | bbl |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 180683.0 | 348711.0 | 3.0 | 0.0 | 11375.0 | 406.0 | 4.0 | 29.0 | 28.0 | 16.0 | 4073900.0 | 40.723092 | -73.844215 | 1.027431e+06 | 202756.7687 | 29.0 | 739.0 | 4052307.0 | 4.022210e+09 |
| 1 | 200540.0 | 315986.0 | 21.0 | 0.0 | 11357.0 | 407.0 | 4.0 | 19.0 | 27.0 | 11.0 | 4097300.0 | 40.794111 | -73.818679 | 1.034456e+06 | 228644.8374 | 19.0 | 973.0 | 4101931.0 | 4.044750e+09 |
| 2 | 204026.0 | 218365.0 | 3.0 | 0.0 | 11211.0 | 301.0 | 3.0 | 34.0 | 50.0 | 18.0 | 3044900.0 | 40.717581 | -73.936608 | 1.001823e+06 | 200716.8913 | 34.0 | 449.0 | 3338310.0 | 3.028870e+09 |
| 3 | 204337.0 | 217969.0 | 10.0 | 0.0 | 11211.0 | 301.0 | 3.0 | 34.0 | 53.0 | 18.0 | 3044900.0 | 40.713537 | -73.934456 | 1.002420e+06 | 199244.2531 | 34.0 | 449.0 | 3338342.0 | 3.029250e+09 |
| 4 | 189565.0 | 223043.0 | 21.0 | 0.0 | 11215.0 | 306.0 | 3.0 | 39.0 | 44.0 | 21.0 | 3016500.0 | 40.666778 | -73.975979 | 9.909138e+05 | 182202.4260 | 39.0 | 165.0 | 3025654.0 | 3.010850e+09 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 683783 | 155433.0 | 217978.0 | 25.0 | 0.0 | 11211.0 | 301.0 | 3.0 | 34.0 | 53.0 | 18.0 | 3051900.0 | 40.713211 | -73.954944 | 9.967407e+05 | 199121.6363 | 34.0 | 519.0 | 3062513.0 | 3.023690e+09 |
| 683784 | 183795.0 | 348185.0 | 7.0 | 0.0 | 11375.0 | 406.0 | 4.0 | 29.0 | 28.0 | 15.0 | 4070700.0 | 40.715194 | -73.856650 | 1.023989e+06 | 199873.6475 | 29.0 | 707.0 | 4075448.0 | 4.031810e+09 |
| 683785 | 166161.0 | 401670.0 | 12.0 | 0.0 | 10314.0 | 501.0 | 5.0 | 50.0 | 63.0 | 24.0 | 5020100.0 | 40.620762 | -74.136517 | 9.463514e+05 | 165466.0763 | 50.0 | 201.0 | 5011657.0 | 5.004080e+09 |
| 683786 | 184028.0 | 504204.0 | 9.0 | 0.0 | 10457.0 | 205.0 | 2.0 | 15.0 | 86.0 | 33.0 | 2023502.0 | 40.850828 | -73.903115 | 1.011054e+06 | 249271.9507 | 15.0 | 23502.0 | 2007757.0 | 2.028120e+09 |
| 683787 | 200607.0 | 306527.0 | 23.0 | 0.0 | 11365.0 | 408.0 | 4.0 | 24.0 | 25.0 | 11.0 | 4134100.0 | 40.732165 | -73.787526 | 1.043136e+06 | 206095.5383 | 24.0 | 1341.0 | 4153657.0 | 4.071360e+09 |
683788 rows × 19 columns
locations = list(zip(imputed_df.latitude, imputed_df.longitude))
import folium
m = folium.Map(
location=[imputed_df["latitude"].mean(), imputed_df["longitude"].mean()],
zoom_start=18,
)
The dataframe has 683788 rows, which is too many to plot, so only the first 10000 points are used.
# Add one small circle marker per (latitude, longitude) pair, limited to the
# first 10000 entries of `locations`.
for location in locations[:10000]:
    folium.CircleMarker(location=location, radius=1).add_to(m)
m
Make this Notebook Trusted to load map: File -> Trust Notebook